#!pip install -U kaleido
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_columns', 100)
import missingno as msno
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
df_customers = pd.read_csv('Dataset/olist_customers_dataset.csv')
df_orders = pd.read_csv('Dataset/olist_orders_dataset.csv')
df_order_items = pd.read_csv('Dataset/olist_order_items_dataset.csv')
df_order_payments = pd.read_csv('Dataset/olist_order_payments_dataset.csv')
df_order_reviews = pd.read_csv('Dataset/olist_order_reviews_dataset.csv')
df_products = pd.read_csv('Dataset/olist_products_dataset.csv')
df_trad = pd.read_csv('Dataset/product_category_name_translation.csv')
df_sellers = pd.read_csv('Dataset/olist_sellers_dataset.csv')
df_geoloc = pd.read_csv('Dataset/olist_geolocation_dataset.csv')
datasets_dict = {"customers": df_customers, "orders": df_orders,
"order_items": df_order_items, "order_payments": df_order_payments,
"order_reviews": df_order_reviews,"products": df_products,
"sellers":df_sellers,"trad": df_trad, "geoloc": df_geoloc }
for item in datasets_dict:
print("**")
print("Le jeu de données %s contient %d lignes et %d colonnes." % (item, datasets_dict[item].shape[0], datasets_dict[item].shape[1]))
for item in datasets_dict:
print("###########################################################")
print(item)
print(datasets_dict[item].columns)
Dans les tables, on peut voir que
- "df_orders" est reliée à la table "df_customers" par la clé "customer_id".
- "order_items", "order_payments" et "order_reviews" sont relié à "orders" par la clé "order_id"
- "products" et "trad" sont relié par "product_category_name"
- "order_items" et "products" sont relié par "product_id"
- "order_items" et ""sellers" sont relié par "seller_id"
On va donc les fusionner.
Visualisation de valeurs manquantes
for df in datasets_dict:
msno.matrix(datasets_dict[df])
plt.title("Répartition de valeur manquantes {}".format(df))
plt.xlabel("Nom des colonnes")
plt.ylabel("Nombre de lignes")
df_customers.shape
df_customers.head()
df_customers.shape
df_geoloc.shape
df_geoloc.head()
# Doublons ?
df_geoloc['geolocation_zip_code_prefix'].nunique()
df_geoloc[df_geoloc['geolocation_zip_code_prefix']== 1041].head()
On remarque qu'à un même code postal sont associées différents localisations ('geolocation_lat', 'geolocation_lng'). Nous allons prendre la moyenne des différentes positions géographiques correspondant à un même code postal.
# Regrouper les latitudes/longitudes par leurs moyennes
df_geoloc = df_geoloc.groupby('geolocation_zip_code_prefix') \
.agg({'geolocation_lat': 'mean', 'geolocation_lng': 'mean',
'geolocation_city': 'first', 'geolocation_state': 'first'}) \
.reset_index()
# Vérification
df_geoloc['geolocation_zip_code_prefix'].nunique()
df_geoloc.head()
df_geoloc.geolocation_zip_code_prefix.unique()
def join_dfs(df1:pd.DataFrame, df2:pd.DataFrame, key:str, how:str='left') -> pd.DataFrame:
print(f"{'The shape of df1:':<22}", df1.shape)
print(f"{'The shape of df2:':<22}", df2.shape)
df = df1.merge(df2, how=how, on=key)
print("The shape of final df:", df.shape)
return df
Jointure avec la table 'df_customers'
# change key name of geolocation
df_geoloc.rename(
columns={"geolocation_zip_code_prefix": "customer_zip_code_prefix"}, inplace=True
)
# join consumers info dataframes
df_customers = join_dfs(
df_customers,
df_geoloc,
key="customer_zip_code_prefix",
how="left",
)
df_customers.head()
df_customers[["customer_city", "geolocation_city"]][
~(df_customers.customer_city == df_customers.geolocation_city)
].sample(10)
df_customers[["customer_state", "geolocation_state"]][
~(df_customers.customer_state == df_customers.geolocation_state)
].sample(5)
Y a une différence d'accent pour les 'city' et les 'state' contiennent quelques valeurs manquantes. On va garder celles de 'customer'
fig = px.scatter_mapbox(df_customers,
lat="geolocation_lat",
lon="geolocation_lng",
mapbox_style = "open-street-map",
hover_data = ["geolocation_state"],
zoom=3,
title="Répartition des clients selon leurs états")
fig.show()
fig.write_image("plots/clients_etat.png")
Le ecommerce brésilien a atteint le Portugal
# drop useless columns
df_customers.drop(columns=['geolocation_city', 'geolocation_state'], inplace=True)
df_customers.shape
df_orders.head()
df_orders.isna().sum()
df_orders.order_status.unique()
df_orders.info()
df_orders=pd.read_csv('Dataset/olist_orders_dataset.csv',parse_dates=['order_purchase_timestamp','order_approved_at','order_delivered_carrier_date','order_delivered_customer_date','order_estimated_delivery_date'])
df_orders.info()
df_orders = join_dfs(df_orders, df_order_payments, key="order_id", how="left")
df_orders = join_dfs(df_orders, df_order_reviews, key="order_id", how="left")
df_orders = join_dfs(df_orders, df_order_items, key="order_id", how="left")
df_orders.head()
df = pd.DataFrame()
sum_missing_values = df_orders.isnull().sum()
percentage = 100 * sum_missing_values / len(df_orders)
df['% Missing Values'] = percentage.round(2)
df.style.background_gradient(axis=0)
df_orders.drop_duplicates(subset="order_id", keep="last", inplace=True)
# join customers info dataframes df_orders
df_customers = join_dfs(df_customers, df_orders, key="customer_id", how="left")
df_customers.head()
# Fusion du nouveau dataframe avec "products" via la clé "product_id"
df_customers = join_dfs(df_customers, df_products, key="product_id", how="left")
df_customers.head()
# Fusion du nouveau dataframe avec "trad"
df_customers = join_dfs(df_customers, df_trad, key="product_category_name", how="left")
df_customers.head()
df_customers = join_dfs(df_customers, df_sellers, key="seller_id", how="left")
df_customers.head()
df_customers.drop(columns='seller_zip_code_prefix', inplace=True)
df_customers.shape
df_customers.to_csv(('Dataset/olist_merged.csv'))
olist_df = pd.read_csv('Dataset/olist_merged.csv', index_col=0,parse_dates=['order_purchase_timestamp','order_approved_at','order_delivered_carrier_date','order_delivered_customer_date','order_estimated_delivery_date','review_creation_date','review_answer_timestamp','shipping_limit_date'])
olist_df.shape
olist_df.head()
olist_df.describe()
olist_df[olist_df.duplicated()]
Aucun doublon
olist_df[olist_df['review_id'].isnull() & olist_df['review_score'].isnull()]
# Suppression des variables inutiles pour notre étude
olist_df = olist_df.drop(['review_comment_title','review_comment_message',
'review_creation_date','review_answer_timestamp',
'review_id','seller_id'],axis=1)
olist_df[olist_df['geolocation_lng'].isnull() & olist_df['geolocation_lat'].isnull()]
msno.matrix(olist_df)
plt.figure(figsize=(7, 5))
sns.countplot(y="review_score", data=olist_df)
plt.title("Répartition des scores des avis clients")
plt.ylabel("Score")
plt.show()
totalOrdersByState = olist_df.groupby('customer_state')['order_id'].nunique().sort_values(ascending=False)
totalOrdersByState
plt.figure(figsize=(15,5))
plt.title('total des commandes par Etats des clients')
sns.barplot(x=totalOrdersByState.index, y=totalOrdersByState)
Une grande partie des commandes se font depuis l'état de Sao Paolo. Ce qui est tout à fait possible car c'est l'état le plus peuplé du Brésil.
olist_df[olist_df.customer_state.str.contains('SP')]['customer_state'].value_counts()
plt.figure(figsize=(7, 5))
sns.countplot(y="order_status", data=olist_df)
plt.title("Répartition des statuts de commandes")
plt.ylabel("order_status")
plt.show()
Valeurs aberrantes
olist_df.info()
# Date de commande > date à de paiement
olist_df[(olist_df['order_purchase_timestamp'] > olist_df['order_delivered_carrier_date'])]
olist_df = olist_df[~(olist_df['order_purchase_timestamp'] > olist_df['order_delivered_carrier_date'])]
olist_df.shape
# Date de commande > date à laquelle la commande a été traitée
olist_df[(olist_df['order_purchase_timestamp'] > olist_df['order_approved_at'])]
# Date de commande > date de livraison
olist_df[(olist_df['order_purchase_timestamp'] > olist_df['order_delivered_customer_date'])]
Outliers
integer_col = olist_df.select_dtypes(include=[np.float, np.integer]).columns
integer_col[1::]
# Figure
plt.figure(figsize = (11,7))
x = olist_df[integer_col[1::]]
sns.set(style='whitegrid')
sns.boxplot(data=x)
# Add labels
plt.title("Distribution de quelques valeurs numériques clients")
plt.xlabel("")
plt.ylabel('Données clients')
plt.xticks(rotation=45)
plt.show()
plt.figure(figsize=(7, 5))
sns.countplot(y="product_photos_qty", data=olist_df)
plt.title("Nombre de photos par produits")
plt.ylabel("product_photos_qty")
plt.show()
payment = olist_df.groupby(['payment_type']).count()[
'customer_city'].sort_values(ascending=False)
plt.figure(figsize=(10,4))
plt.title('Répartition des moyens de paiement')
sns.barplot(x = payment.values,
y = payment.index)
plt.figure(figsize=(10,10))
sns.heatmap(olist_df.corr())
integer_col = olist_df.select_dtypes(include=[np.float, np.integer]).columns
# Distribution des variables quantitatives
for n in range(olist_df[integer_col].shape[1]):
plt.figure(figsize=(16, 8))
plot = sns.histplot(x=olist_df[integer_col].iloc[:, n], bins=100)
plt.title(f"Distribution de {olist_df[integer_col].iloc[:, n].name}",
fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.savefig(f"{olist_df[integer_col].iloc[:, n].name}_dist.png",
bbox_inches='tight')
plt.show()
olist_df.columns
olist_df.shape
prod = olist_df[olist_df['product_name_lenght']>65]
prod.shape
fig = px.pie(prod, values='product_name_lenght', names='product_category_name',title='Top 5 categories par leur taille de nom')
fig.show()
fig.write_image("plots/product_length.png")
fig = px.pie(prod, values='product_name_lenght', names='product_category_name_english',title='Top 5 categories par leur taille de nom')
fig.show()
fig.write_image("plots/product_En_length.png")
px.pie(prod, values='product_name_lenght', names='payment_value',title='Répatition des produits par leur taille et prix')
plt.figure(figsize=(20, 10))
sns.heatmap(abs(olist_df[integer_col].corr()), annot=True, cmap="YlGnBu",
annot_kws={"fontsize":12})
plot.set_xlabel(' ', fontsize=12)
plot.set_ylabel(f" ", fontsize=12)
plot.tick_params(labelsize=12)
plt.savefig(f"plots/corr_table_plot.png", bbox_inches="tight")
cat_col = olist_df.select_dtypes(include=[np.object]).columns
cat_col = ['customer_state','order_status', 'payment_type', 'product_category_name','product_category_name_english', 'seller_state']
for n in range(olist_df[cat_col].shape[1]):
plt.figure(figsize=(10, 20))
plot = sns.countplot(y=olist_df[cat_col].iloc[:, n], palette='Blues_r')
plt.title(f"Distribution de {olist_df[cat_col].iloc[:,n].name}",
fontsize=12)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()
colASupprimer = ['product_category_name', 'product_name_lenght',
'product_description_lenght', 'product_photos_qty', 'product_weight_g',
'product_length_cm', 'product_height_cm', 'product_width_cm']
olist_df.drop(colASupprimer, axis=1, inplace=True)
olist_df = olist_df.rename(columns={'product_category_name_english': 'product_category_name'})
plt.figure(figsize=(16, 10))
plt.title('Nombre de produit vendu/acheté par catégorie', fontsize = 20)
sns.barplot(x = olist_df['product_category_name'].value_counts().index,
y = olist_df['product_category_name'].value_counts().values)
plt.xlabel("Nom des différentes catégories", fontsize = 12)
plt.ylabel("Nombre de produit", fontsize = 12)
plt.tick_params(axis = 'both', labelsize = 12)
plt.xticks(rotation=90)
plt.show()
print("Number of unique categories: ", len(olist_df.product_category_name.unique()))
plt.figure(figsize=(10,6))
top_25_prod_categories = olist_df.groupby('product_category_name')['product_id'].count().sort_values(ascending=False).head(25)
sns.barplot(x=top_25_prod_categories.index, y=top_25_prod_categories.values)
plt.xticks(rotation=90)
plt.xlabel('Product Category')
plt.title('Top 25 des catégories');
plt.show()
temp_col = olist_df.select_dtypes(exclude=[np.object, np.integer, np.float]).columns
for n in range(olist_df[temp_col].shape[1]):
plt.figure(figsize=(16, 10))
plot = sns.histplot(x=olist_df[temp_col].iloc[:, n], bins=100)
plt.title(f"Distribution de {olist_df[temp_col].iloc[:, n].name}",
fontsize=20)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.savefig(f"{olist_df[temp_col].iloc[:, n].name}_dist.png",
bbox_inches='tight')
plt.show()
olist_df.columns
start_date = '31-12-2018'
mask = (olist_df['shipping_limit_date'] > start_date)
df = olist_df.loc[mask]
df
df = pd.DataFrame()
sum_missing_values = olist_df.isnull().sum()
percentage = 100 * sum_missing_values / len(olist_df)
df['% Missing Values'] = percentage.round(2)
df.style.background_gradient(axis=0)
Le taux des valeurs manquantes est faible , nous pouvons supprimer les lignes contenant des valeurs manquante.
olist_df = olist_df.dropna()
olist_df.shape
olist_df["review_score"] = olist_df["review_score"].astype("int32")
olist_df[olist_df['order_status'] == 'delivered'].shape[0]/ olist_df.shape[0] * 100
On remarque qu'il y a une petite pourcentage de commande qui n'ont pas la statut délivré. On va donc se centré que sur les produits délivré.
olist_df = olist_df[olist_df['order_status'] == 'delivered']
olist_df.describe().transpose()
# Plot categories count in dataset
fig = plt.figure(figsize=(18, 8))
sns.countplot(data=olist_df, x="product_category_name")
plt.xticks(range(0, olist_df["product_category_name"].nunique()),
olist_df["product_category_name"].unique(),
rotation=90)
plt.title(f"Les catégories produits les plus représentées\n")
plt.show()
On va regrouper plusieurs catégories en une catégorie
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('fashio|watch')] = 'Fashion'
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('health|beauty|perfum')] = 'Beauty'
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('home|furniture|garden|bath|house|cuisine|flower')] = 'Home Decor'
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('book')] = 'Books'
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('phon|compu|tablet|electro|consol|security|Tech|appliance|air')] = 'Electronics and Tech'
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('baby|diaper')] = 'Baby'
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('grocer|food|drink')] = 'Food and drinks'
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('media|cd|audio|musi|dvd|cine')] = 'Media'
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('art')] = 'Art'
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('construction|costruction')] = 'Construction'
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('commerce|market_place')] = 'Industry and Commerce'
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('auto|stati|chris|cool|Art')] = 'Miscellaneous'
# Plot categories count in dataset
fig = plt.figure(figsize=(18, 8))
sns.countplot(data=olist_df, x="product_category_name")
plt.xticks(range(0, olist_df["product_category_name"].nunique()),
olist_df["product_category_name"].unique(),
rotation=90)
plt.title(f"Les catégories produits")
plt.show()
Analyse sur un customer
cust_many_orders = olist_df[olist_df['customer_unique_id'] == '8d50f5eadf50201ccdcedfb9e2ac8455']
cust_many_orders
fig = px.scatter(cust_many_orders, x='product_category_name', y='price'
,title='Customer with most purchase as per product category price')
fig.show()
fig.write_image("plots/one_cust_purchase.png")
fig = px.scatter(cust_many_orders, x='order_delivered_customer_date', y='order_purchase_timestamp'
,title='Customer with most purchase delivery dates')
fig.show()
fig.write_image("plots/one_cust_purchase_time.png")
olist_df.to_csv('Dataset/olist_final_cleaned.csv')